In [1]:
import datetime as dt
import os
import time

from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_of_author
from cltk.corpus.greek.tlg.parse_tlg_indices import get_id_author
import pandas
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer

Make vectorizer


In [26]:
def stream_lemmatized_files(corpus_dir):
    # yield (file id, text) pairs for all docs in a dir
    user_dir = os.path.expanduser('~/cltk_data/user_data/' + corpus_dir)
    files = os.listdir(user_dir)

    for file in files:
        filepath = os.path.join(user_dir, file)
        with open(filepath) as fo:
            #TODO rm words less than 3 chars long
            yield file[3:-4], fo.read()  # strip the filename's prefix and extension, leaving the TLG id

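A quick sanity check of the generator (a sketch; it assumes ~/cltk_data/user_data/tlg_lemmatized_no_accents_no_stops already exists):


In [ ]:
# peek at the first (id, text) pair the generator yields
first_id, first_text = next(stream_lemmatized_files('tlg_lemmatized_no_accents_no_stops'))
print(first_id, len(first_text.split()), 'words')
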
In [3]:
t0 = dt.datetime.utcnow()

map_id_author = get_id_author()

df = pandas.DataFrame(columns=['id', 'author', 'text', 'epithet'])

for _id, text in stream_lemmatized_files('tlg_lemmatized_no_accents_no_stops'):
    author = map_id_author[_id]
    epithet = get_epithet_of_author(_id)
    df = df.append({'id': _id, 'author': author, 'text': text, 'epithet': epithet}, ignore_index=True)

print(df.shape)
print('... finished in {}'.format(dt.datetime.utcnow() - t0))
print('Number of texts:', len(df))


(1823, 4)
... finished in 0:00:19.186495
Number of texts: 1823

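Appending to the DataFrame inside the loop copies the frame on every pass; building a list of dicts and constructing the DataFrame once is usually faster. A sketch of that alternative (same columns, untimed):


In [ ]:
# collect all rows first, then build the DataFrame in a single call
rows = []
for _id, text in stream_lemmatized_files('tlg_lemmatized_no_accents_no_stops'):
    rows.append({'id': _id,
                 'author': map_id_author[_id],
                 'text': text,
                 'epithet': get_epithet_of_author(_id)})
df = pandas.DataFrame(rows, columns=['id', 'author', 'text', 'epithet'])
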
In [4]:
text_list = df['text'].tolist()

# make a list of indices of short texts to drop
# For pres, get distribution of words per doc (see the sketch below)
short_text_drop_index = [index for index, text in enumerate(text_list) if len(text) < 500]  # ~100 words

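For the pres, the words-per-doc distribution can be read straight off text_list; a sketch with pandas.Series.describe() (the 500-character cutoff above is roughly 100 words):


In [ ]:
# rough distribution of words per document, to justify the short-text cutoff
words_per_doc = pandas.Series([len(text.split()) for text in text_list])
print(words_per_doc.describe())
print('docs under ~100 words:', (words_per_doc < 100).sum())
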
In [5]:
t0 = dt.datetime.utcnow()

# TODO: Consider using a generator with CountVectorizer (sketch below) http://stackoverflow.com/a/21600406

# time & size counts, w/ 50 texts:
# 0:01:15 & 202MB @ ngram_range=(1, 3), min_df=2, max_features=500
# 0:00:26 & 80MB @ ngram_range=(1, 2), analyzer='word', min_df=2, max_features=5000
# 0:00:24 & 81MB @ ngram_range=(1, 2), analyzer='word', min_df=2, max_features=50000

# time & size counts, w/ 1823 texts:
# 0:02:18 & 46MB @ ngram_range=(1, 1), analyzer='word', min_df=2, max_features=500000
# 0:02:01 & 47MB @ ngram_range=(1, 1), analyzer='word', min_df=2, max_features=1000000

# max features in the lemmatized data set: 551428
max_features = 100000
ngrams = 1
vectorizer = CountVectorizer(ngram_range=(1, ngrams), analyzer='word', 
                             min_df=2, max_features=max_features)
term_document_matrix = vectorizer.fit_transform(text_list)  # input is a list of strings, 1 per document

# save matrix
vector_fp = os.path.expanduser('~/cltk_data/user_data/vectorizer_test_features{0}_ngrams{1}.pickle'.format(max_features, ngrams))
joblib.dump(term_document_matrix, vector_fp)

print('... finished in {}'.format(dt.datetime.utcnow() - t0))


... finished in 0:01:28.154103

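On the TODO above: CountVectorizer.fit_transform accepts any iterable of strings, so the lemmatized files could be streamed from disk instead of held in text_list. A sketch, assuming the same corpus directory (this only avoids keeping the raw texts in memory; the vocabulary and matrix are built as before):


In [ ]:
# stream texts into the vectorizer instead of materializing text_list
def iter_texts(corpus_dir):
    for _id, text in stream_lemmatized_files(corpus_dir):
        yield text

term_document_matrix = vectorizer.fit_transform(iter_texts('tlg_lemmatized_no_accents_no_stops'))
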
Transform term matrix into feature table


In [6]:
# Reload the term-document matrix; the next cells put its BoW vectors into a new df
term_document_matrix = joblib.load(vector_fp)  # scipy.sparse.csr.csr_matrix

In [7]:
term_document_matrix.shape


Out[7]:
(1823, 100000)

In [8]:
term_document_matrix_array = term_document_matrix.toarray()

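The dense copy is much larger than the sparse matrix (1823 × 100000 cells of integer counts), which is why the pickled DataFrame at the end of the notebook comes to ~2.3 GB. A quick check of the footprint:


In [ ]:
# size of the dense array in GB
print('{:.2f} GB'.format(term_document_matrix_array.nbytes / 1024 ** 3))
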
In [9]:
dataframe_bow = pandas.DataFrame(term_document_matrix_array, columns=vectorizer.get_feature_names())

In [10]:
ids_list = df['id'].tolist()

In [11]:
len(ids_list)


Out[11]:
1823

In [12]:
dataframe_bow.shape


Out[12]:
(1823, 100000)

In [13]:
dataframe_bow['id'] = ids_list

In [14]:
authors_list = df['author'].tolist()
dataframe_bow['author'] = authors_list

In [15]:
epithets_list = df['epithet'].tolist()
dataframe_bow['epithet'] = epithets_list

In [16]:
# For pres, give distribution of epithets, including None
dataframe_bow['epithet']


Out[16]:
0                  Historici/-ae
1                        Tragici
2                        Tragici
3                         Comici
4                           None
5                           None
6                  Historici/-ae
7               Philosophici/-ae
8                      Sophistae
9                     Theologici
10                 Historici/-ae
11      Scriptores Ecclesiastici
12                     Geographi
13                    Periegetae
14                          None
15                    Lyrici/-ae
16              Philosophici/-ae
17                       Tragici
18                          None
19                     Geographi
20                          None
21                        Medici
22                 Historici/-ae
23                 Historici/-ae
24                        Medici
25                    Lyrici/-ae
26                  Onirocritici
27                Paradoxographi
28      Scriptores Ecclesiastici
29                       Tragici
                  ...           
1793               Historici/-ae
1794               Historici/-ae
1795               Historici/-ae
1796                        None
1797               Historici/-ae
1798           Epigrammatici/-ae
1799                        None
1800            Philosophici/-ae
1801            Philosophici/-ae
1802                    Elegiaci
1803                  Lyrici/-ae
1804                     Iambici
1805                 Alchemistae
1806            Philosophici/-ae
1807            Philosophici/-ae
1808                      Comici
1809                      Comici
1810            Philosophici/-ae
1811                  Lyrici/-ae
1812                   Sophistae
1813                   Epici/-ae
1814            Philosophici/-ae
1815            Philosophici/-ae
1816               Historici/-ae
1817                 Astronomici
1818            Philosophici/-ae
1819                  Lyrici/-ae
1820               Historici/-ae
1821                        None
1822                      Comici
Name: epithet, dtype: object

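The actual distribution of epithets (including None) is easier to read with value_counts; a sketch:


In [ ]:
# counts per epithet; dropna=False keeps the rows with epithet None in the tally
dataframe_bow['epithet'].value_counts(dropna=False)
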
In [21]:
t0 = dt.datetime.utcnow()

# remove rows whose epithet is None (drops 334 rows)
# note on selecting None in pandas: http://stackoverflow.com/a/24489602
dataframe_bow = dataframe_bow[dataframe_bow.epithet.notnull()]
dataframe_bow.shape

print('... finished in {}'.format(dt.datetime.utcnow() - t0))


... finished in 0:00:02.298707

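A quick check of the "drops 334 rows" note, comparing the pre-filter epithets_list against the filtered frame:


In [ ]:
# number of rows removed by the epithet filter
print('rows dropped:', len(epithets_list) - len(dataframe_bow))
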
In [22]:
t0 = dt.datetime.utcnow()

dataframe_bow.to_csv(os.path.expanduser('~/cltk_data/user_data/tlg_bow.csv'))

print('... finished in {}'.format(dt.datetime.utcnow() - t0))


... finished in 0:05:07.627325

In [23]:
dataframe_bow.shape


Out[23]:
(1489, 100003)

In [24]:
dataframe_bow.head(10)


Out[24]:
ʹʹ ʹγʹ ʹδʹ αʹ ααα ααπτος ααπτους ααρων αασαμην αασχετον ... ϲωμα ϲωματα ϲωματι ϲωματοϲ ϲωματων ϲωμαϲι ϲωμαϲιν id author epithet
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1459 Lepidus Hist. Historici/-ae
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0825 Melito Trag. Tragici
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0331 [Polyidus] Trag. Tragici
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0417 Archippus Comic. Comici
6 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 2475 Menecrates Hist. Historici/-ae
7 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 4075 Marinus Phil. Philosophici/-ae
8 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 2127 Troilus Soph. Sophistae
9 0 0 0 0 0 0 0 4 0 0 ... 0 0 0 0 0 0 0 2074 Apollinaris Theol. Theologici
10 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 2173 Antileon Hist. Historici/-ae
11 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1419 Hermas Scr. Eccl., Pastor Hermae Scriptores Ecclesiastici

10 rows × 100003 columns


In [25]:
# write dataframe_bow to disk, for fast reuse while classifying
# ~2.3 GB on disk
fp_df = os.path.expanduser('~/cltk_data/user_data/tlg_bow_df.pickle')
joblib.dump(dataframe_bow, fp_df)


Out[25]:
['/root/cltk_data/user_data/tlg_bow_df.pickle']
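
For the classification step, the frame can be read back from the same path; a minimal sketch:


In [ ]:
# reload the BoW DataFrame for classifying
dataframe_bow = joblib.load(fp_df)
dataframe_bow.shape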